import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import shap
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
from sklearn.linear_model import Ridge, Lasso
from sklearn.linear_model import LinearRegression
import lime
import lime.lime_tabular
from sklearn.ensemble import RandomForestRegressor
from IPython.display import display
# Load the preprocessed MEPS data set and take a first look at it.
print("Data head")
df = pd.read_csv("MEPS_data_preprocessed.csv")
df
# Summary statistics, printed one column at a time for readability.
description = df.describe()
for col in description:
    print(description[col], '\n')
# One boxplot per variable, laid out on an 11x4 grid of axes.
fig, axes = plt.subplots(11, 4, figsize=(12, 48))
# axes.ravel() walks the grid row by row, matching the column order of df.
for i, axis in enumerate(axes.ravel()):
    df.boxplot(column=df.columns[i], ax=axis)
plt.savefig("boxplots")
# Y variable
df.boxplot(column="HEALTHEXP", return_type='axes')
# The target has a long right tail, hence a log(1 + y) transformation.
# Fix: the original reshaped the values to a (n, 1) 2-D array before
# assigning back to the column; assigning a 2-D array to a single column
# is fragile (rejected by newer pandas) and the reshape adds nothing.
df['HEALTHEXP'] = np.log1p(df['HEALTHEXP'])
# Correlation matrix rendered as a heat map with labelled axes.
f = plt.figure(figsize=(25, 25))
plt.matshow(df.corr(), fignum=f.number)
n_cols = df.shape[1]
plt.xticks(range(n_cols), df.columns, fontsize=14, rotation=45)
plt.yticks(range(n_cols), df.columns, fontsize=14)
colorbar = plt.colorbar()
colorbar.ax.tick_params(labelsize=14)
# Drop the panel number (19 or 20): an administrative label, not predictive.
df.drop(columns=['PANEL'], inplace=True)
# One-hot encode the categorical variables: exactly those columns taking
# three distinct values {-1, 1, 2}, i.e. "inapplicable / yes / no".
value_counts = df.nunique()
cat_inx = value_counts[value_counts == 3].index.tolist()
print(f"Categorical variables:\n {cat_inx}")
# INSCOV15 also has three values but is kept in its original encoding.
cat_inx.remove('INSCOV15')
df = pd.get_dummies(df, columns=cat_inx)
# Separate features from the target, then hold out 20% of rows for testing.
x = df.drop(columns=['HEALTHEXP'])
y = df[['HEALTHEXP']]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=123)
def get_model_results(model_name: str, pred_train, y_train, pred_test, y_test, print_res):
    """Compute RMSE, R^2 and MAE for the train and test splits.

    Parameters
    ----------
    model_name : label used in the printed report.
    pred_train, y_train : predictions and true targets for the train split.
    pred_test, y_test : predictions and true targets for the test split.
    print_res : when truthy, print a human-readable summary.

    Returns
    -------
    list: [rmse_train, rmse_test, r_2_train, r_2_test, mae_train, mae_test]
    """
    rmse_train = np.sqrt(mean_squared_error(y_train, pred_train))
    rmse_test = np.sqrt(mean_squared_error(y_test, pred_test))
    r_2_train = r2_score(y_train, pred_train)
    r_2_test = r2_score(y_test, pred_test)
    mae_train = mean_absolute_error(y_train, pred_train)
    mae_test = mean_absolute_error(y_test, pred_test)
    if print_res:
        print(f"{model_name} results:")
        print(f"training rmse: {rmse_train}\ntraining r2: {r_2_train}\ntraining mae: {mae_train}")
        print(f"test rmse: {rmse_test}\ntest r2: {r_2_test}\ntest mae: {mae_test}\n")
    return [rmse_train, rmse_test, r_2_train, r_2_test, mae_train, mae_test]
# Hyperparameter grid for the XGBoost model.
n_es = [10 * i for i in range(3, 15)]   # n_estimators: 30, 40, ..., 140
depth = list(range(3, 12))              # max_depth: 3 .. 11
feature_names = list(x_train)
col_names_res_xgb = ['n', 'd', 'rmse_train', 'rmse_test', 'r_2_train', 'r_2_test', 'mae_train', 'mae_test']
df_res_xgb = pd.DataFrame(columns=col_names_res_xgb)
def xgb_train_and_predict(depth, n_est):
    """Fit an XGBoost regressor on the train split and predict both splits.

    Parameters
    ----------
    depth : max tree depth passed to the regressor.
    n_est : number of boosting rounds.

    Returns
    -------
    (fitted regressor, train predictions, test predictions)

    Bug fix: the original body ignored its ``depth``/``n_est`` parameters
    and read the module-level globals ``d`` and ``n`` instead, so the
    function silently trained with whatever the grid-search loop last set.
    """
    xg_reg = xgb.XGBRegressor(objective='reg:squarederror',
                              colsample_bytree=0.4,
                              learning_rate=0.2,
                              max_depth=depth,       # was the global `d`
                              alpha=15,
                              n_estimators=n_est,    # was the global `n`
                              feature_names=feature_names
                              )
    xg_reg.fit(x_train.to_numpy(), y_train.to_numpy())
    predictions_train = xg_reg.predict(x_train.to_numpy())
    predictions_test = xg_reg.predict(x_test.to_numpy())
    return xg_reg, predictions_train, predictions_test
# Grid search over n_estimators x max_depth.
# Fix: DataFrame.append was deprecated and removed in pandas 2.0 (and was
# O(n^2) anyway) — collect the result rows in a list and build the frame once.
res_rows = []
for n in n_es:
    for d in depth:
        xg_reg, predictions_train, predictions_test = xgb_train_and_predict(d, n)
        train_res = get_model_results("XGB ", predictions_train, y_train, predictions_test, y_test, False)
        res_rows.append([n, d] + train_res)
df_res_xgb = pd.DataFrame(res_rows, columns=col_names_res_xgb)
df_res_xgb.sort_values('r_2_test')
# Refit XGBoost with the parameters selected from the grid search
# (the globals keep the names d / n used throughout the notebook).
d, n = 6, 60
xg_reg, predictions_train, predictions_test = xgb_train_and_predict(d, n)
get_model_results("XGB ", predictions_train, y_train, predictions_test, y_test, True)
# Linear model with L1 regularisation (Lasso).
# Fix: max_iter=10e5 is the float 1000000.0; max_iter must be an integer
# and recent scikit-learn versions reject non-integral values.
l_reg = Lasso(alpha=0.5, max_iter=1_000_000)
l_reg.fit(x_train, y_train)
pred_lr_test = l_reg.predict(x_test)
pred_lr_train = l_reg.predict(x_train)
get_model_results("Lasso Regression", pred_lr_train, y_train, pred_lr_test, y_test, True)
def explain_with_lime(model, index, num_f):
    """Display a LIME explanation for one test-set observation.

    Parameters
    ----------
    model : fitted regressor exposing ``predict``.
    index : positional index into ``x_test``.
    num_f : number of features to show in the explanation.

    Uses the module-level ``x_train``, ``x_test`` and ``feature_names``.
    """
    tab_explainer = lime.lime_tabular.LimeTabularExplainer(
        x_train.values,
        mode='regression',
        feature_names=feature_names,
        class_names=['HEALTHEXP'],
        verbose=True,
    )
    explanation = tab_explainer.explain_instance(x_test.values[index], model.predict, num_features=num_f)
    explanation.show_in_notebook(show_table=True)
# Selected test observations to explain for the XGB model.
inx_for_expl = [20, 42, 49, 60]
def explain_patient(i):
    """Print the true target, a 6-feature LIME explanation and the full
    feature vector for the i-th selected test patient.
    """
    # Fix: the printed label was misspelled "HEALTEXP"; the column is HEALTHEXP.
    print(f"Patient {inx_for_expl[i]}: true HEALTHEXP {y_test.values[inx_for_expl[i]]}\n")
    explain_with_lime(xg_reg, inx_for_expl[i], 6)
    print("Patient description:")
    for name, val in zip(feature_names, x_test.values[inx_for_expl[i]]):
        print(name, ": ", val)
explain_patient(0)
The model predicted a lower value because patient #20 does not have diagnosed diabetes (DIABDX_1 = 0), because of his race, and because he reported no joint pain in the last 12 months (JTPAIN31_1 = 0) and no high blood pressure (HIBPDX_1 = 0).
# LIME explanation for the second selected patient (test index 42).
explain_patient(1)
For patient #42, the race (RACE3), having health insurance (INSCOV15), joint pains (JTPAIN31_1) and a high income (POVCAT15) pushed the predicted health expenses higher. The absence of high blood pressure and the gender lower the final output.
Looking through the data, among all the one-hot-encoded categorical variables this patient has high cholesterol (CHOLDX_1 : 1.0), joint pains (JTPAIN31_1 : 1.0), social limitations (SOCLIM31_1 : 1.0) and cognitive limitations (COGLIM31_1 : 1.0).
# LIME explanation for the third selected patient (test index 49).
explain_patient(2)
For patient #49, not having diabetes (DIABDX_1), his race (RACE3), not having joint pains (JTPAIN31_1), not having high blood pressure (HIBPDX_1) and not having a high-cholesterol diagnosis (CHOLDX_1) lowered the expected health expenses. The gender increases the value.
Looking through the data, among all the one-hot-encoded categorical variables this patient is recorded as pregnant (PREGNT31_1 : 1.0) and currently smoking (ADSMOK42_1 : 1.0).
# Last selected patient (test index 60): compare XGB vs Lasso explanations.
# Fix: the two prints below indexed inx_for_expl with the stale module-level
# loop variable `i` (last bound to 44 by the boxplot loop — an IndexError on a
# 4-element list); the intended index is 3. Also fixes the "HEALTEXP" typo.
print("Patient description:")
for name, val in zip(feature_names, x_test.values[inx_for_expl[3]]):
    print(name, ": ", val)
print(f"Patient {inx_for_expl[3]}: true HEALTHEXP {y_test.values[inx_for_expl[3]]}, XGB model \n")
explain_with_lime(xg_reg, inx_for_expl[3], 6)
print(f"Patient {inx_for_expl[3]}: true HEALTHEXP {y_test.values[inx_for_expl[3]]}, Lasso model \n")
explain_with_lime(l_reg, inx_for_expl[3], 6)
The XGB model predicted a value closer to the true value than the Lasso model did. The two explanations are different; the only argument they share is PCS42 (the physical component summary score).
Not having insurance (INSCOV15), the race, the lack of joint pains, no high-cholesterol diagnosis and no high-blood-pressure diagnosis lowered the XGB model's prediction.
Age and the overall rating of feelings (K6SUM42) lowered the predicted value, while the mental component summary (MCS42) and not being married (MARRY31X) increased it.